
//Chris' WinGlide enhancements
//Copr. 1998, Chris Dohnal (cwdohnal@ucdavis.edu)

#include <windows.h>
#include <mmsystem.h>
#include <math.h>

//Lookup table management
BOOL AllocateGammaCorrectionTable(VOID);
VOID FreeGammaCorrectionTable(VOID);
DWORD *GetGammaCorrectionTablePointer(VOID);
//Lookup table construction and the packing helpers it uses
BOOL CalculateGammaCorrectionTable(float, float, float);
WORD Pack565RGB(DWORD, DWORD, DWORD);
BOOL CalculateColorConversionTable(FOURCC, float, float, float);
DWORD PackFourDword(DWORD, DWORD, DWORD, DWORD);
//Table driven copy routines (source, dest, source pitch, dest pitch,
//eight byte groups per scanline, scanline count, lookup table)
VOID GammaPitchCopy8(BYTE *, BYTE *, LONG, LONG, LONG, LONG, DWORD *);
VOID MMXGammaPitchCopy8(BYTE *, BYTE *, LONG, LONG, LONG, LONG, DWORD *);
VOID ColorConvertPitchCopy8(BYTE *, BYTE *, LONG, LONG, LONG, LONG, DWORD *);
VOID MMXColorConvertPitchCopy8(BYTE *, BYTE *, LONG, LONG, LONG, LONG, DWORD *);

//Size of the lookup table in bytes: 2^16 entries (one for every 16 bit
//5-6-5 source color), each stored in a 32 bit DWORD (65536 * 4 = 262144)
#define GAMMA_CORRECTION_TABLE_SIZE		262144

//The lookup table itself; NULL while no table is allocated
DWORD *pGammaCorrectionTable = NULL;

BOOL AllocateGammaCorrectionTable(VOID) {
	//Free the gamma correction table if it already exists
	if (pGammaCorrectionTable != NULL) {
		FreeGammaCorrectionTable();
	}

	//Allocate memory for the gamma correction table
	pGammaCorrectionTable = GlobalAlloc(GMEM_FIXED, GAMMA_CORRECTION_TABLE_SIZE);
	if (pGammaCorrectionTable == NULL) {
		return FALSE;
	}

	return TRUE;
}

//Releases the lookup table, if one is allocated, and resets the table
//pointer to NULL. Safe to call when no table exists.
VOID FreeGammaCorrectionTable(VOID) {
	//Nothing to release
	if (pGammaCorrectionTable == NULL) {
		return;
	}

	GlobalFree(pGammaCorrectionTable);
	pGammaCorrectionTable = NULL;
}

//Returns the current lookup table pointer, or NULL when no table is allocated.
DWORD *GetGammaCorrectionTablePointer(VOID) {
	DWORD *pTable;

	pTable = pGammaCorrectionTable;

	return pTable;
}

BOOL CalculateGammaCorrectionTable(float redGamma, float greenGamma, float blueGamma) {
	DWORD red, green, blue;
	double oneOverRedGamma, oneOverGreenGamma, oneOverBlueGamma;
	DWORD gcRedArray[32], gcGreenArray[64], gcBlueArray[32];

	//Return failure if the gamma correction table has not been allocated
	if (pGammaCorrectionTable == NULL) {
		return FALSE;
	}

	//Precalculate some information
	if (redGamma == 0.0f) {
		oneOverRedGamma = 1.0f;
	}
	else {
		oneOverRedGamma = (double)(1.0f / redGamma);
	}
	if (greenGamma == 0.0f) {
		oneOverGreenGamma = 1.0f;
	}
	else {
		oneOverGreenGamma = (double)(1.0f / greenGamma);
	}
	if (blueGamma == 0.0f) {
		oneOverBlueGamma = 1.0f;
	}
	else {
		oneOverBlueGamma = (double)(1.0f / blueGamma);
	}

	for (red = 0; red < 32; red++) {
		float fResult;
		DWORD gcRed;

		if (redGamma == 0.0f) {
			gcRed = 0;
		}
		else {
			fResult = (float)(pow((double)((float)red / 31.0f), oneOverRedGamma) * 31.0);
			gcRed = (DWORD)fResult;
			if (gcRed > 31) {
				gcRed = 31;
			}
			if (gcRed < 0) {
				gcRed = 0;
			}
		}

		gcRedArray[red] = gcRed;
	}
	for (green = 0; green < 64; green++) {
		float fResult;
		DWORD gcGreen;

		if (greenGamma == 0.0f) {
			gcGreen = 0;
		}
		else {
			fResult = (float)(pow((double)((float)green / 63.0f), oneOverGreenGamma) * 63.0);
			gcGreen = (DWORD)fResult;
			if (gcGreen > 63) {
				gcGreen = 63;
			}
			if (gcGreen < 0) {
				gcGreen = 0;
			}
		}

		gcGreenArray[green] = gcGreen;
	}
	for (blue = 0; blue < 32; blue++) {
		float fResult;
		DWORD gcBlue;

		if (blueGamma == 0.0f) {
			gcBlue = 0;
		}
		else {
			fResult = (float)(pow((double)((float)blue / 31.0f), oneOverBlueGamma) * 31.0);
			gcBlue = (DWORD)fResult;
			if (gcBlue > 31) {
				gcBlue = 31;
			}
			if (gcBlue < 0) {
				gcBlue = 0;
			}
		}

		gcBlueArray[blue] = gcBlue;
	}

	//Build the gamma correction table
	for (red = 0; red < 32; red++) {
		for (green = 0; green < 64; green++) {
			for (blue = 0; blue < 32; blue++) {
				DWORD gcRed, gcGreen, gcBlue;
				WORD sourceColor;
				WORD gammaCorrectedColor;

				//Calculate the gamma corrected RGB values
				gcRed = gcRedArray[red];
				gcGreen = gcGreenArray[green];
				gcBlue = gcBlueArray[blue];
				
				//Pack the three source colors into the 16 bit 5-6-5 color format
				sourceColor = Pack565RGB(red, green, blue);

				//Pack the three gamma corrected colors into the 16 bit 5-6-5 color format
				gammaCorrectedColor = Pack565RGB(gcRed, gcGreen, gcBlue);

				//Add the entry to the gamma correction table
				pGammaCorrectionTable[sourceColor] = (DWORD)gammaCorrectedColor;
			}
		}
	}

	return TRUE;
}
	
//Packs three color components into one 16 bit 5-6-5 value:
//bits 15-11 = red, bits 10-5 = green, bits 4-0 = blue.
//Components are masked to their field width (5, 6 and 5 bits), so any
//higher bits in the arguments are ignored.
WORD Pack565RGB(DWORD red, DWORD green, DWORD blue) {
	DWORD redField, greenField, blueField;

	redField = (red & 0x1F) << 11;
	greenField = (green & 0x3F) << 5;
	blueField = blue & 0x1F;

	return (WORD)(redField | greenField | blueField);
}

BOOL CalculateColorConversionTable(FOURCC fourCC, float redGamma, float greenGamma, float blueGamma) {
	DWORD red, green, blue;
	double oneOverRedGamma, oneOverGreenGamma, oneOverBlueGamma;
	float gcfRedArray[32], gcfGreenArray[64], gcfBlueArray[32];

	//Return failure if the gamma correction table has not been allocated
	if (pGammaCorrectionTable == NULL) {
		return FALSE;
	}

	//Precalculate some information
	if (redGamma == 0.0f) {
		oneOverRedGamma = 1.0f;
	}
	else {
		oneOverRedGamma = (double)(1.0f / redGamma);
	}
	if (greenGamma == 0.0f) {
		oneOverGreenGamma = 1.0f;
	}
	else {
		oneOverGreenGamma = (double)(1.0f / greenGamma);
	}
	if (blueGamma == 0.0f) {
		oneOverBlueGamma = 1.0f;
	}
	else {
		oneOverBlueGamma = (double)(1.0f / blueGamma);
	}

	for (red = 0; red < 32; red++) {
		float gcfRed;

		if (redGamma == 0.0f) {
			gcfRed = 0.0f;
		}
		else {
			gcfRed = (float)(pow((double)((float)red / 31.0f), oneOverRedGamma) * 255.0);
		}

		gcfRedArray[red] = gcfRed;
	}
	for (green = 0; green < 64; green++) {
		float gcfGreen;

		if (greenGamma == 0.0f) {
			gcfGreen = 0.0f;
		}
		else {
			gcfGreen = (float)(pow((double)((float)green / 63.0f), oneOverGreenGamma) * 255.0);
		}

		gcfGreenArray[green] = gcfGreen;
	}
	for (blue = 0; blue < 32; blue++) {
		float gcfBlue;

		if (blueGamma == 0.0f) {
			gcfBlue = 0.0f;
		}
		else {
			gcfBlue = (float)(pow((double)((float)blue / 31.0f), oneOverBlueGamma) * 255.0);
		}

		gcfBlueArray[blue] = gcfBlue;
	}

	//Build the color conversion table
	for (red = 0; red < 32; red++) {
		for (green = 0; green < 64; green++) {
			for (blue = 0; blue < 32; blue++) {
				float fY, fU, fV;
				float gcfRed, gcfGreen, gcfBlue;
				DWORD gcY, gcU, gcV;
				WORD sourceColor;
				DWORD convertedColor;

				//Calculate the gamma corrected RGB values
				gcfRed = gcfRedArray[red];
				gcfGreen = gcfGreenArray[green];
				gcfBlue = gcfBlueArray[blue];

				//Convert from RGB to YUV
				fY = 0.257f * gcfRed + 0.504f * gcfGreen + 0.098f * gcfBlue + 16.0f;
				fU = -0.148f * gcfRed - 0.291f * gcfGreen + 0.439f * gcfBlue + 128.0f;
				fV = 0.439f * gcfRed - 0.368f * gcfGreen - 0.071f * gcfBlue + 128.0f;
				
				gcY = (LONG)fY;
				if (gcY > 255) {
					gcY = 255;
				}
				if (gcY < 0) {
					gcY = 0;
				}

				gcU = (LONG)fU;
				if (gcU > 255) {
					gcU = 255;
				}
				if (gcU < 0) {
					gcU = 0;
				}

				gcV = (LONG)fV;
				if (gcV > 255) {
					gcV = 255;
				}
				if (gcV < 0) {
					gcV = 0;
				}


				//Pack the three source colors into the 16 bit 5-6-5 color format
				sourceColor = Pack565RGB(red, green, blue);

				//Pack the YUV data
				convertedColor = PackFourDword(gcV, gcY, gcU, gcY);

				//Add the entry to the gamma correction table
				pGammaCorrectionTable[sourceColor] = convertedColor;
			}
		}
	}

	return TRUE;
}

//Packs the low byte of each argument into one DWORD, dw1 in the most
//significant byte (bits 31-24) down to dw4 in the least significant byte
//(bits 7-0). Bits above the low byte of each argument are ignored.
DWORD PackFourDword(DWORD dw1, DWORD dw2, DWORD dw3, DWORD dw4) {
	return ((dw1 & 0xFF) << 24)
		| ((dw2 & 0xFF) << 16)
		| ((dw3 & 0xFF) << 8)
		| (dw4 & 0xFF);
}

//Copies a rectangle of 16 bit values from pSource to pDest, passing every
//value through the pGammaTable lookup table (integer registers only).
//
//  pSource, pDest           - first byte of the source / destination rectangle
//  lSourcePitch, lDestPitch - bytes from the start of one scanline to the next
//  lXCount8                 - number of 8 byte groups (four 16 bit values) per scanline
//  lYCount                  - number of scanlines
//  pGammaTable              - table of DWORDs indexed by the 16 bit source value
//
//Nothing is copied when lXCount8 <= 0 or lYCount <= 0.
//Whole table entries are ORed together (the second one shifted left by 16),
//so this routine relies on every entry having its upper 16 bits clear, which
//is how CalculateGammaCorrectionTable stores them.
//The function is naked: it saves and restores ebp, ebx, esi and edi itself,
//addresses its arguments relative to esp and removes all 28 bytes of
//arguments with "ret 28".
//NOTE(review): the prototype names no calling convention, so the callee
//cleanup presumably relies on the file being built with stdcall as the
//default - confirm against the project settings.
__declspec(naked) VOID GammaPitchCopy8(BYTE *pSource, BYTE *pDest, LONG lSourcePitch, LONG lDestPitch, LONG lXCount8, LONG lYCount, DWORD *pGammaTable) {
	__asm {
		//Prolog
		//The four pushes below move esp down by 16, hence the "+ 16" in
		//every argument offset that follows
		push ebp
		push ebx
		push esi
		push edi

		//Get the number of scanlines to copy
		mov ebx, [esp + 24 + 16]	//lYCount

		//Get lXCount8
		mov ecx, [esp + 20 + 16]	//lXCount8

		//Make sure lYCount > 0
		test ebx, ebx
		jle epilog

		//Make sure lXCount8 > 0
		test ecx, ecx
		jle epilog

		//Get the source and destination pointers
		mov esi, [esp + 4 + 16]		//pSource
		mov edi, [esp + 8 + 16]		//pDest

		//Get the address of the gamma correction table
		mov ebp, [esp + 28 + 16]	//pGammaTable

		//Calculate lExtraSourcePitch and lExtraDestPitch
		//lExtraSourcePitch = lSourcePitch - (lXCount8 << 3);
		//lExtraDestPitch = lDestPitch - (lXCount8 << 3);
		shl ecx, 3
		mov eax, [esp + 12 + 16]	//lSourcePitch
		mov edx, [esp + 16 + 16]	//lDestPitch
		sub eax, ecx
		sub edx, ecx

		//Push lExtraSourcePitch and lExtraDestPitch onto the stack
		//(argument offsets grow by another 8 from here on)
		push edx					//lExtraDestPitch
		push eax					//lExtraSourcePitch

		ycopyloop:
			//Get the number of eight byte pieces in each scanline
			mov ecx, [esp + 20 + 16 + 8]	//lXCount8

			//Save ebx
			//(the scanline counter; ebx is used as scratch in the inner loop)
			push ebx

			xcopyloop:
				//Load four 16 bit source values: eax = values 0 and 1,
				//ebx = values 2 and 3
				mov eax, dword ptr [esi]
				mov ebx, dword ptr [esi + 4]
				
				//esi and edi are saved so they can be used as scratch
				push esi
				mov esi, eax

				push edi
				and esi, 0xFFFF			//esi = value 0

				shr eax, 16				//eax = value 1
				mov edi, ebx
				
				shr ebx, 16				//ebx = value 3
				mov edx, [ebp + esi*4]	//edx = table[value 0]

				and edi, 0xFFFF			//edi = value 2
				mov esi, [ebp + eax*4]	//esi = table[value 1]

				shl esi, 16
				mov eax, [ebp + ebx*4]	//eax = table[value 3]
				
				shl eax, 16
				mov ebx, [ebp + edi*4]	//ebx = table[value 2]

				or edx, esi				//edx = table[value 0] | table[value 1] << 16
				pop edi

				or ebx, eax				//ebx = table[value 2] | table[value 3] << 16
				pop esi

				mov dword ptr [edi], edx
				mov dword ptr [edi + 4], ebx
								
				add esi, 8
				add edi, 8

				dec ecx
				jnz xcopyloop

			//Restore ebx
			pop ebx
		
			//Calculate the address of the next scanline
			add esi, [esp]			//lExtraSourcePitch
			add edi, [esp + 4]		//lExtraDestPitch

			dec ebx
			jnz ycopyloop

		//Pop lExtraSourcePitch and lExtraDestPitch off the stack
		pop eax					//lExtraSourcePitch
		pop edx					//lExtraDestPitch

		//Epilog
	epilog:
		pop edi
		pop esi
		pop ebx
		pop ebp
		ret 28
	}
}

//MMX version of GammaPitchCopy8: copies a rectangle of 16 bit values from
//pSource to pDest, replacing every value with the low 16 bits of its
//pGammaTable entry.
//
//  pSource, pDest           - first byte of the source / destination rectangle
//  lSourcePitch, lDestPitch - bytes from the start of one scanline to the next
//  lXCount8                 - number of 8 byte groups (four 16 bit values) per scanline
//  lYCount                  - number of scanlines
//  pGammaTable              - table of DWORDs indexed by the 16 bit source value
//
//Nothing is copied when lXCount8 <= 0 or lYCount <= 0; that early exit jumps
//past emms, which is harmless because no MMX instruction has run yet.
//The function is naked: it saves and restores ebp, ebx, esi and edi itself,
//addresses its arguments relative to esp and removes all 28 bytes of
//arguments with "ret 28".
//NOTE(review): the prototype names no calling convention, so the callee
//cleanup presumably relies on the file being built with stdcall as the
//default - confirm against the project settings.
__declspec(naked) VOID MMXGammaPitchCopy8(BYTE *pSource, BYTE *pDest, LONG lSourcePitch, LONG lDestPitch, LONG lXCount8, LONG lYCount, DWORD *pGammaTable) {
	__asm {
		//Prolog
		//The four pushes below move esp down by 16, hence the "+ 16" in
		//every argument offset that follows
		push ebp
		push ebx
		push esi
		push edi

		//Get the number of scanlines to copy
		mov ebx, [esp + 24 + 16]	//lYCount

		//Get lXCount8
		mov ecx, [esp + 20 + 16]	//lXCount8

		//Make sure lYCount > 0
		test ebx, ebx
		jle epilog

		//Make sure lXCount8 > 0
		test ecx, ecx
		jle epilog

		//Get the source and destination pointers
		mov esi, [esp + 4 + 16]		//pSource
		mov edi, [esp + 8 + 16]		//pDest
		
		//Subtract 8 from edi so it can be increased by 8 before moving data
		sub edi, 8

		//Get the address of the gamma correction table
		mov ebp, [esp + 28 + 16]	//pGammaTable

		//Calculate lExtraSourcePitch and lExtraDestPitch
		//lExtraSourcePitch = lSourcePitch - (lXCount8 << 3);
		//lExtraDestPitch = lDestPitch - (lXCount8 << 3);
		shl ecx, 3
		mov eax, [esp + 12 + 16]	//lSourcePitch
		mov edx, [esp + 16 + 16]	//lDestPitch
		sub eax, ecx
		sub edx, ecx

		//Push lExtraSourcePitch and lExtraDestPitch onto the stack
		//(argument offsets grow by another 8 from here on)
		push edx					//lExtraDestPitch
		push eax					//lExtraSourcePitch

		//Put all zeros in mm7
		pxor mm7, mm7

		ycopyloop:
			//Get the number of eight byte pieces in each scanline
			mov ecx, [esp + 20 + 16 + 8]	//lXCount8

			xcopyloop:
				//Load four 16 bit source values
				movq mm0, qword ptr [esi]

				//Zero extend values 0 and 1 to dwords in mm0
				movq mm1, mm0
				punpcklwd mm0, mm7

				movd eax, mm0			//eax = value 0
				psrlq mm0, 32

				movd edx, mm0			//edx = value 1
				punpckhwd mm1, mm7		//mm1 = values 2 and 3 as dwords

				movd mm0, dword ptr [ebp + eax*4]	//table[value 0]
				add esi, 8

				movd mm2, dword ptr [ebp + edx*4]	//table[value 1]
				add edi, 8

				movd eax, mm1			//eax = value 2
				psrlq mm1, 32

				movd edx, mm1			//edx = value 3

				movd mm1, dword ptr [ebp + eax*4]	//table[value 2]
				//Interleave the low words of entries 0 and 1 into one dword
				punpcklwd mm0, mm2

				movd mm2, dword ptr [ebp + edx*4]	//table[value 3]

				//Interleave the low words of entries 2 and 3 into one dword
				punpcklwd mm1, mm2

				//Combine the two dwords into the eight output bytes
				punpckldq mm0, mm1
				dec ecx

				//movq does not change the flags, so jnz still tests dec ecx
				movq qword ptr [edi], mm0
				jnz xcopyloop

			//Calculate the address of the next scanline
			add esi, [esp]			//lExtraSourcePitch
			add edi, [esp + 4]		//lExtraDestPitch

			dec ebx
			jnz ycopyloop

		//Pop lExtraSourcePitch and lExtraDestPitch off the stack
		pop eax					//lExtraSourcePitch
		pop edx					//lExtraDestPitch

		//Empty the MMX state
		emms

		//Epilog
	epilog:
		pop edi
		pop esi
		pop ebx
		pop ebp
		ret 28
	}
}

//Copies a rectangle of 16 bit values from pSource to pDest, converting each
//pair of adjacent source values into one output DWORD through the pGammaTable
//lookup table (integer registers only). For a pair with table entries A
//(first value) and B (second value) the output DWORD is built as:
//    byte 0 = byte 0 of A
//    byte 1 = (byte 1 of A + byte 1 of B) / 2
//    byte 2 = byte 2 of B
//    byte 3 = (byte 3 of A + byte 3 of B) / 2
//With a table built by CalculateColorConversionTable (byte 0 = Y, byte 1 = U,
//byte 2 = Y, byte 3 = V) this gives each pixel its own Y and the pair an
//averaged U and V.
//
//  pSource, pDest           - first byte of the source / destination rectangle
//  lSourcePitch, lDestPitch - bytes from the start of one scanline to the next
//  lXCount8                 - number of 8 byte groups (four 16 bit values) per scanline
//  lYCount                  - number of scanlines
//  pGammaTable              - table of DWORDs indexed by the 16 bit source value
//
//Nothing is copied when lXCount8 <= 0 or lYCount <= 0.
//The function is naked: it saves and restores ebp, ebx, esi and edi itself,
//addresses its arguments relative to esp and removes all 28 bytes of
//arguments with "ret 28".
//NOTE(review): the prototype names no calling convention, so the callee
//cleanup presumably relies on the file being built with stdcall as the
//default - confirm against the project settings.
__declspec(naked) VOID ColorConvertPitchCopy8(BYTE *pSource, BYTE *pDest, LONG lSourcePitch, LONG lDestPitch, LONG lXCount8, LONG lYCount, DWORD *pGammaTable) {
	//lowByteMask is not referenced by the code below
	static DWORD lowByteMask = 0x00FF00FF;
	static DWORD highByteMask = 0xFF00FF00;		//bytes 1 and 3
	static DWORD firstByteMask = 0x000000FF;	//byte 0
	static DWORD thirdByteMask = 0x00FF0000;	//byte 2

	__asm {
		//Prolog
		//The four pushes below move esp down by 16, hence the "+ 16" in
		//every argument offset that follows
		push ebp
		push ebx
		push esi
		push edi

		//Get the number of scanlines to copy
		mov ebx, [esp + 24 + 16]	//lYCount

		//Get lXCount8
		mov ecx, [esp + 20 + 16]	//lXCount8

		//Make sure lYCount > 0
		test ebx, ebx
		jle epilog

		//Make sure lXCount8 > 0
		test ecx, ecx
		jle epilog

		//Get the source and destination pointers
		mov esi, [esp + 4 + 16]		//pSource
		mov edi, [esp + 8 + 16]		//pDest

		//Get the address of the gamma correction table
		mov ebp, [esp + 28 + 16]	//pGammaTable

		//Calculate lExtraSourcePitch and lExtraDestPitch
		//lExtraSourcePitch = lSourcePitch - (lXCount8 << 3);
		//lExtraDestPitch = lDestPitch - (lXCount8 << 3);
		shl ecx, 3
		mov eax, [esp + 12 + 16]	//lSourcePitch
		mov edx, [esp + 16 + 16]	//lDestPitch
		sub eax, ecx
		sub edx, ecx

		//Push lExtraSourcePitch and lExtraDestPitch onto the stack
		//(argument offsets grow by another 8 from here on)
		push edx					//lExtraDestPitch
		push eax					//lExtraSourcePitch

		ycopyloop:
			//Get the number of eight byte pieces in each scanline
			mov ecx, [esp + 20 + 16 + 8]	//lXCount8

			//Save ebx
			//(the scanline counter; ebx is used as scratch in the inner loop)
			push ebx

			xcopyloop:
				//Load four 16 bit source values: eax = values 0 and 1,
				//ebx = values 2 and 3
				mov eax, dword ptr [esi]
				mov ebx, dword ptr [esi + 4]

				//esi and edi are saved so they can be used as scratch
				push esi
				mov esi, eax

				push edi
				and esi, 0xFFFF			//esi = value 0

				shr eax, 16				//eax = value 1
				mov edi, ebx
				
				shr ebx, 16				//ebx = value 3
				mov edx, [ebp + esi*4]	//edx = table[value 0]

				and edi, 0xFFFF			//edi = value 2
				mov esi, [ebp + eax*4]	//esi = table[value 1]

				//First pair: keep full copies of both entries on the stack
				//while bytes 1 and 3 are summed in edx
				push edx
				and edx, highByteMask

				push esi
				and esi, highByteMask

				add edx, esi
				pop esi					//esi = table[value 1] again (pop leaves the flags alone)

				//The sum of the two top bytes can carry out of bit 31; rcr
				//shifts that carry back in while halving both sums
				rcr edx, 1
				pop eax					//eax = table[value 0] again

				and esi, thirdByteMask	//byte 2 of the second entry
				and eax, firstByteMask	//byte 0 of the first entry

				or eax, esi
				and edx, highByteMask	//drop the bits shifted out of each average

				or edx, eax				//edx = first output dword
				mov eax, [ebp + ebx*4]	//eax = table[value 3]
				
				//Second pair: same combination using register copies
				mov esi, eax
				mov ebx, [ebp + edi*4]	//ebx = table[value 2]

				and eax, highByteMask
				mov edi, ebx

				and ebx, highByteMask
				and esi, thirdByteMask	//byte 2 of the second entry

				and edi, firstByteMask	//byte 0 of the first entry
				add ebx, eax

				//Must directly follow the add: it consumes the add's carry
				rcr ebx, 1
				or esi, edi

				and ebx, highByteMask
				pop edi

				or ebx, esi				//ebx = second output dword
				pop esi

				mov dword ptr [edi], edx
				mov dword ptr [edi + 4], ebx
								
				add esi, 8
				add edi, 8

				dec ecx
				jnz xcopyloop

			//Restore ebx
			pop ebx
		
			//Calculate the address of the next scanline
			add esi, [esp]			//lExtraSourcePitch
			add edi, [esp + 4]		//lExtraDestPitch

			dec ebx
			jnz ycopyloop

		//Pop lExtraSourcePitch and lExtraDestPitch off the stack
		pop eax					//lExtraSourcePitch
		pop edx					//lExtraDestPitch

		//Epilog
	epilog:
		pop edi
		pop esi
		pop ebx
		pop ebp
		ret 28
	}
}

//MMX version of ColorConvertPitchCopy8: copies a rectangle of 16 bit values
//from pSource to pDest, converting each pair of adjacent source values into
//one output DWORD through the pGammaTable lookup table. For a pair with table
//entries A (first value) and B (second value) the output DWORD is built as:
//    byte 0 = byte 0 of A
//    byte 1 = (byte 1 of A + byte 1 of B) / 2
//    byte 2 = byte 0 of B
//    byte 3 = (byte 3 of A + byte 3 of B) / 2
//Byte 2 is taken from byte 0 of B here; for a table built by
//CalculateColorConversionTable bytes 0 and 2 of an entry hold the same Y
//value, so the result matches the integer version.
//
//  pSource, pDest           - first byte of the source / destination rectangle
//  lSourcePitch, lDestPitch - bytes from the start of one scanline to the next
//  lXCount8                 - number of 8 byte groups (four 16 bit values) per scanline
//  lYCount                  - number of scanlines
//  pGammaTable              - table of DWORDs indexed by the 16 bit source value
//
//Nothing is copied when lXCount8 <= 0 or lYCount <= 0; that early exit jumps
//past emms, which is harmless because no MMX instruction has run yet.
//The function is naked: it saves and restores ebp, ebx, esi and edi itself,
//addresses its arguments relative to esp and removes all 28 bytes of
//arguments with "ret 28".
//NOTE(review): the prototype names no calling convention, so the callee
//cleanup presumably relies on the file being built with stdcall as the
//default - confirm against the project settings.
__declspec(naked) VOID MMXColorConvertPitchCopy8(BYTE *pSource, BYTE *pDest, LONG lSourcePitch, LONG lDestPitch, LONG lXCount8, LONG lYCount, DWORD *pGammaTable) {
	static DWORDLONG lowByteMask = 0x00FF00FF00FF00FF;		//bytes 0, 2, 4 and 6
	static DWORDLONG highByteMask = 0xFF00FF00FF00FF00;	//bytes 1, 3, 5 and 7

	__asm {
		//Prolog
		//The four pushes below move esp down by 16, hence the "+ 16" in
		//every argument offset that follows
		push ebp
		push ebx
		push esi
		push edi

		//Get the number of scanlines to copy
		mov ebx, [esp + 24 + 16]	//lYCount

		//Get lXCount8
		mov ecx, [esp + 20 + 16]	//lXCount8

		//Make sure lYCount > 0
		test ebx, ebx
		jle epilog

		//Make sure lXCount8 > 0
		test ecx, ecx
		jle epilog

		//Get the source and destination pointers
		mov esi, [esp + 4 + 16]		//pSource
		mov edi, [esp + 8 + 16]		//pDest
		
		//Subtract 8 from edi so it can be increased by 8 before moving data
		sub edi, 8

		//Get the address of the gamma correction table
		mov ebp, [esp + 28 + 16]	//pGammaTable

		//Calculate lExtraSourcePitch and lExtraDestPitch
		//lExtraSourcePitch = lSourcePitch - (lXCount8 << 3);
		//lExtraDestPitch = lDestPitch - (lXCount8 << 3);
		shl ecx, 3
		mov eax, [esp + 12 + 16]	//lSourcePitch
		mov edx, [esp + 16 + 16]	//lDestPitch
		sub eax, ecx
		sub edx, ecx

		//Push lExtraSourcePitch and lExtraDestPitch onto the stack
		//(argument offsets grow by another 8 from here on)
		push edx					//lExtraDestPitch
		push eax					//lExtraSourcePitch

		//Put all zeros in mm7
		pxor mm7, mm7

		//Load bitmasks
		movq mm5, lowByteMask
		movq mm6, highByteMask

		ycopyloop:
			//Get the number of eight byte pieces in each scanline
			mov ecx, [esp + 20 + 16 + 8]	//lXCount8

			xcopyloop:
				//Load four 16 bit source values
				movq mm0, qword ptr [esi]
				add edi, 8

				//Zero extend values 0 and 1 to dwords in mm0
				movq mm1, mm0
				punpcklwd mm0, mm7

				movd eax, mm0			//eax = value 0
				psrlq mm0, 32

				movd edx, mm0			//edx = value 1
				punpckhwd mm1, mm7		//mm1 = values 2 and 3 as dwords

				movd mm0, dword ptr [ebp + eax*4]	//table[value 0]
				add esi, 8

				movd mm2, dword ptr [ebp + edx*4]	//table[value 1]
				movq mm3, mm0

				movd eax, mm1			//eax = value 2
				punpcklbw mm0, mm7		//entry 0 bytes widened to words

				psrlq mm1, 32
				//Low dword of mm3 = low word of entry 0, low word of entry 1
				punpcklwd mm3, mm2

				movd edx, mm1			//edx = value 3
				punpcklbw mm2, mm7		//entry 1 bytes widened to words

				movd mm1, dword ptr [ebp + eax*4]	//table[value 2]
				paddw mm0, mm2			//per byte sums of entries 0 and 1

				movd mm2, dword ptr [ebp + edx*4]	//table[value 3]
				movq mm4, mm1

				psrlw mm0, 1			//sums halved = per byte averages
				punpcklbw mm1, mm7		//entry 2 bytes widened to words

				//Low dword of mm4 = low word of entry 2, low word of entry 3
				punpcklwd mm4, mm2
				punpcklbw mm2, mm7		//entry 3 bytes widened to words

				packuswb mm0, mm0		//averages back to bytes
				paddw mm1, mm2			//per byte sums of entries 2 and 3

				pand mm0, mm6			//keep the averaged bytes 1 and 3
				psrlw mm1, 1

				pand mm3, mm5			//keep byte 0 of entry 0 and byte 0 of entry 1
				packuswb mm1, mm1

				pand mm1, mm6
				pand mm4, mm5

				por mm0, mm3			//first output dword
				por mm1, mm4			//second output dword

				punpckldq mm0, mm1
				dec ecx

				//movq does not change the flags, so jnz still tests dec ecx
				movq qword ptr [edi], mm0
				jnz xcopyloop

			//Calculate the address of the next scanline
			add esi, [esp]			//lExtraSourcePitch
			add edi, [esp + 4]		//lExtraDestPitch

			dec ebx
			jnz ycopyloop

		//Pop lExtraSourcePitch and lExtraDestPitch off the stack
		pop eax					//lExtraSourcePitch
		pop edx					//lExtraDestPitch

		//Empty the MMX state
		emms

		//Epilog
	epilog:
		pop edi
		pop esi
		pop ebx
		pop ebp
		ret 28
	}
}
